###########################################
# Direct vs. indirect rule and the size of subnational governance units
#
# Replication Material for:
# Continuity or Change? (In)direct Rule in British and French Colonial Africa
# 
# Carl Mueller-Crepon, 2020
# International Organization
#
# File Description:
# Main analysis of the size of districts in British and French colonies.
#
# Called from indrule_analysis_all.R 
#
################################

rm(list = ls())

# INITIALIZE

# Libraries
library(lfe)
library(stargazer)
library(plyr)
library(ggplot2)
library(viridisLite)
library(maptools)
library(rgeos)
library(raster)
library(MASS)
library(lme4)
library(rgeos)

# Paths
fig.path = "figures/governance/"
tab.path = "tables/governance/"

# Output type
output.type <- "latex"

##### LaTeX Functionalities
source("scripts/functions/analysis_functions.R")

# Balance table
source("scripts/functions/balance_table.R")

# Load data
load("data/data_governance.RData")

# Some variable creation

# ... French colony dummy
data.dist$colfrnc = ifelse(data.dist$colbrit == 1, 0, 1)

# ... main interaction terms
data.dist$v33.num_colfrnc <- data.dist$v33.num * data.dist$colfrnc
data.dist$has.ruler.1885_colfrnc <- data.dist$has.ruler.1885 * data.dist$colfrnc
data.dist$ruler.1885_colfrnc <- data.dist$ruler.1885 * data.dist$colfrnc
data.dist$has.ruler.1920_colfrnc <- data.dist$has.ruler.1920 * data.dist$colfrnc
data.dist$has.ruler.1900_colfrnc <- data.dist$has.ruler.1900 * data.dist$colfrnc
data.dist$has.any.ruler_colfrnc <- data.dist$has.any.ruler * data.dist$colfrnc
data.dist$province <- paste(data.dist$province,".", data.dist$colony)

data.dist$river.dist.ln <- log(data.dist$river.dist)
data.dist$river.dist.ln_colfrnc <- log(data.dist$river.dist) * data.dist$colfrnc


data.dist$coast.dist.ln <- log(data.dist$coast.dist + 0.1)
data.dist$coast.dist.ln_colfrnc <- log(data.dist$coast.dist + 0.1) * data.dist$colfrnc

data.dist$popdens.ln <- log(data.dist$pop.1880 / data.dist$area)
data.dist$popdens.ln_colfrnc <- log(data.dist$pop.1880 / data.dist$area) * data.dist$colfrnc

data.dist$group.popdens.ln <- log((data.dist$group.pop.1880+1) / data.dist$group.area ) 
data.dist$group.popdens.ln_colfrnc <- log((data.dist$group.pop.1880+1)/ data.dist$group.area ) * data.dist$colfrnc

# 
# # Border-segments x distance to coast
# data.dist$dist.bucket <- cut(data.dist$coast.dist, seq(0,100, by = 2))


##########################################
# MODEL SETUP
##########################################

# ... dependent variable
dep.var <- c("log(area)")


# ... controls with interaction
base.controls <- c("popdens.ln", 
                   "group.popdens.ln","coast.dist.ln", "river.dist.ln")

base.controls.int <- c(base.controls, paste0(base.controls, "_colfrnc"))
ethnic.controls <- paste0("v",c(4,5,28),".num")
ethnic.controls.int <- c(ethnic.controls,  paste0(ethnic.controls,"*colfrnc"))
nature.controls <- c("medianaltitude","medianslope","temperaturemean","evapotranspiration",  
                     "precipitation", "ppetratio","suit", "max.cash.crop" )
nature.controls.int <- c(nature.controls,  paste0(nature.controls,"*colfrnc"))

# ... fixed effects
fe.cross = " colony "

# ... SE cluster on spatial unit
se.cluster <- " province "

# ... main explanatory terms:
main.expl <- c("v33.num","v33.num_colfrnc")

# ... variables to keep and labels
keep.controls <- c("v33.num", "v33.num_colfrnc")
keep.labels <- c("Precol. centralization", "Precol. centralization $\\times$ French")


# ... notes below tables
notes.p <- "$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01."
latex.notes.add <- "Standard errors are clustered on the province-level. All covariates are included also in interaction with `French rule'."

latex.notes.add <- function(width = .95, cluster = "province-level"){
  paste0("\\parbox[t]{",width,"\\textwidth}{\\textit{Notes:} OLS models. Standard errors are clustered on the ", cluster, ".
         Baseline controls include the local population density, ethnic groups'  population density, 
         and the distance to the coast as well as the closest navigable river. 
         Nature controls consist of the local altitude and slope, mean annual temperature, precipitation
         and evapotranspiration, the ratio of the two, agricultural suitability, and soils' suitability for cash crop production. 
         Ethnic controls are the reliance on agriculture and pastoralism, as well as the intensity of agricultural activities. 
         Additionally, all covariates are interacted with `French rule'.
         Significance codes: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01.}")}


################################################
# Colonial district maps
# Appendix Figures A3 and A4
################################################


# All Africa
library(cshapes)
all.africa <- cshp(date = as.Date("2000-01-01"))
all.africa <- all.africa[!all.africa$COWCODE %in% c(580, 581, 590, 402) & all.africa$COWCODE %in% c(402:626, 651),]
all.africa <- gUnaryUnion(all.africa, id = rep(1, length(all.africa)))


# Make colony borders
col.spdf <- gUnaryUnion(gBuffer(dist.spdf, width = 0, byid = T), id = dist.spdf$colony)
east.colonies <- c("Nyasaland",  "Tanzania", "Kenya","Uganda","Northern Rhodesia")

# ... West Africa
png(paste0(fig.path,"distmap_west.png"), width = 7, height = 5, unit = "in", res = 600)
par(mar = c(0,0,0,0))
plot(dist.spdf[!dist.spdf$colony %in% east.colonies,])
plot(all.africa, add = T, border = "grey")
plot(dist.spdf[!dist.spdf$colony %in% east.colonies,], add = T)
plot(col.spdf[!row.names(col.spdf) %in% east.colonies,], lwd = 2, border = "black", add = T)

# calculate position of inset
plotdim <- par("plt")
xleft    = plotdim[2] - (plotdim[2] - plotdim[1]) * 0.2
xright   = plotdim[2]  #
ybottom  = plotdim[4] - (plotdim[4] - plotdim[3]) * 0.18  #
ytop     = plotdim[4]  #

# set position for inset
par(fig = c(xleft, xright, ybottom, ytop) , mar=c(0,0,0,0) , new=TRUE)
plot(all.africa, axes=FALSE, xlab="", ylab="", border = "grey", ylim = c(-34, 37.34041))
plot(as(extent(dist.spdf[!dist.spdf$colony %in% east.colonies,]), "SpatialPolygons"), axes=FALSE, xlab="", ylab="",
     add = T)
dev.off()

# ... South-East Africa
png(paste0(fig.path,"distmap_east.png"), width = 7, height = 5, unit = "in", res = 600)
par(mar = c(0,0,0,0))
plot(dist.spdf[dist.spdf$colony %in% east.colonies,])
plot(all.africa, add = T, border = "grey")
plot(dist.spdf[dist.spdf$colony %in% east.colonies,], add = T)
plot(col.spdf[row.names(col.spdf) %in% east.colonies,], lwd = 2, border = "black", add = T)

# calculate position of inset
plotdim <- par("plt")
xleft    = plotdim[1] + (plotdim[2] - plotdim[1]) * 0.2
xright   = plotdim[1] + (plotdim[2] - plotdim[1]) * 0.4 #
ybottom  = plotdim[2] - (plotdim[4] - plotdim[3]) * 0.23 #
ytop     = plotdim[2] - (plotdim[4] - plotdim[3]) * 0.03 #

# set position for inset
par(fig = c(xleft, xright, ybottom, ytop) , mar=c(0,0,0,0) , new=TRUE)
plot(all.africa, axes=FALSE, xlab="", ylab="", border = "grey", ylim = c(-34, 37.34041))
plot(as(extent(dist.spdf[dist.spdf$colony %in% east.colonies,]), "SpatialPolygons"), axes=FALSE, xlab="", ylab="",
     add = T)
dev.off()


# ... Regression-discontinuity map
colony.rd <- unique(data.dist$colony[!is.na(data.dist$fbperp.dist)])

png(paste0(fig.path,"distmap_rd.png"), width = 7, height = 3, unit = "in", res = 600)
par(mar = c(0,0,0,0))
plot(dist.spdf[dist.spdf$colony %in% colony.rd,],
     col = c("red","blue")[dist.spdf$colbrit[dist.spdf$colony %in% colony.rd]+1],
     border = "white")
plot(all.africa, add = T, border = "grey")
plot(dist.spdf[dist.spdf$colony %in% colony.rd,],
     col = c("red","blue")[dist.spdf$colbrit[dist.spdf$colony %in% colony.rd]+1],
     border = "white", add = T)

# calculate position of inset
plotdim <- par("plt")
xleft    = plotdim[1] + (plotdim[2] - plotdim[1]) * 0.0
xright   = plotdim[1] + (plotdim[2] - plotdim[1]) * 0.3 #
ybottom  = plotdim[2] - (plotdim[4] - plotdim[3]) * 0.3 #
ytop     = plotdim[2] - (plotdim[4] - plotdim[3]) * 0.0 #

# set position for inset
par(fig = c(xleft, xright, ybottom, ytop) , mar=c(0,0,0,0) , new=TRUE)
plot(all.africa, axes=FALSE, xlab="", ylab="", border = "grey", ylim = c(-34, 37.34041))
plot(as(extent(dist.spdf[dist.spdf$colony %in% colony.rd,]), "SpatialPolygons"), axes=FALSE, xlab="", ylab="",
     add = T)
dev.off()



################################################
# Sources
# Appendix Table A6
################################################

# Make base
sources.df <- aggregate.data.frame(list(Observations = data.dist$geo.match),
                                   list(colony = data.dist$colony), FUN = length)

# Recode colony names
old <- c("CDI", "DAH", "GUI", "HV", "MAU","NIG", "SEN", "SOU", "Tanzania")
new <- c("Côte d'Ivoire", "Dahomey", "Guinée", "Haute Volta", "Mauretanie", 
         "Niger", "Senegal", "Soudan", "Tanganyika")
for(i in c(1:length(old))){
  sources.df$colony[sources.df$colony == old[i]] <- new[i]
}


# Encode sources and years:
sources.df <- join(sources.df,
                   read.csv("data/geo_districts_sources.csv", sep = ";", stringsAsFactors = F),
                   by = "colony")
# Make sure LaTeX works
sources.df$source <- gsub("&","and", sources.df$source, fixed = T)

# Print to table
fileConn<-file(paste0(tab.path, "map.sources.tex"))
writeLines(
  stargazer(sources.df,
            title="Summary of sources of district maps",
            notes.align = "l",label=paste0("tab.map.sources"),align =F,
            digits = 0, summary = F, rownames = F,digit.separate = 0,
            type  = output.type), 
  fileConn)
close(fileConn)

###############################################
# Summary statistics
# Appendix Table A7
###############################################

# Select variables
sum.vars <- c("colfrnc","area", main.expl[1], base.controls, nature.controls, ethnic.controls)
sum.stats.df <- data.dist[,sum.vars]
sum.var.labs <- c("French", "Area", "Precolonial centralization","Population density (log)", "Ethnic groups' population density (log)",
                  "Distance to coast", "Distance to nav. river","Median altitude",
                  "Median slope","Evapotranspiration","Precipitation","Evapotranspiration/Precipitation",
                  "Mean temperature","Agricultural suitability","Cash crop suitability","Reliance on agriculture",
                  "Reliance on pastoralism", "Intensity of agriculture")

# Print to table
fileConn<-file(paste0(tab.path, "govscale.sumtab.tex"))
writeLines(
  stargazer(sum.stats.df,
            title="Summary of district-area data",
            covariate.labels = sum.var.labs,
            notes.align = "l",label=paste0("tab.data.summary.govscale"),align =F,
            digits = 3,  rownames = F,digit.separate = 0,
            type  = output.type), 
  fileConn)
close(fileConn)




################################################
# Main model, comparing British and French rule
# Table 5, main paper
# Figure 8, main paper
################################################

stub <- "govscale.main"

# Specification
add.vars <- list(c(base.controls.int),
                 c(base.controls.int, nature.controls.int),
                 c(base.controls.int, nature.controls.int, ethnic.controls.int))

# Estimation
this.m <- lapply(add.vars, function(av){
  return(felm(make_form(dv = dep.var, expl = c(main.expl, av),
                        iv = "0", fe = fe.cross, se =  se.cluster), data = data.dist[,]))
})



# Output

# ... info
mean.dv <- latex.any.row("Mean DV", round_any(unlist(lapply(this.m, function(m){mean(m$response)})), 0.01) )
add.lines <- list(latex.any.row("Colony FE: ",rep("yes",length(this.m))),
                  latex.any.row("Baseline controls: ",rep("yes",length(this.m))),
                  latex.any.row("Nature controls: ",c("no","yes","yes")),
                  latex.any.row("Ethnic controls: ",c("no","no","yes")),
                  mean.dv)

# ... stargazer
fileConn<-file(paste0(tab.path, stub, ".tex"))
writeLines(stargazer(this.m,
                     title="Precolonial centralization and the size of districts",
                     dep.var.caption = "",dep.var.labels.include = FALSE,
                     column.labels = c("log(District Area)"),column.separate = c(3),
                     keep= keep.controls, covariate.labels = keep.labels,
                     notes.align = "l",label=paste0("tab.",stub),align =T,
                     add.lines = add.lines, 
                     digits = 3, intercept.top = T,intercept.bottom = F,
                     omit.stat = c("rsq","res.dev","ser"),
                     notes = latex.notes.add(.8), notes.label = "", notes.append = F,
                     type  = output.type),
           fileConn)
close(fileConn)

# Save model
saveRDS(this.m, file = "data/govscale_main.rds")

# Effect sizes:
m <- this.m[[3]]
frnc.effect <- c(coef = sum(m$coefficients[c(1,2),1]),
                 sd = (m$clustervcv)^0.5)

# Figure
# Data Frame for Plot
plot.df <- data.frame(do.call(rbind,lapply(this.m, function(m){
  x <- c(coef.b = m$coefficients[1], se.b = diag(m$clustervcv)[1]^.5, 
         coef.f = m$coefficients[1] + m$coefficients[2], se.f = (diag(m$clustervcv)[1] + diag(m$clustervcv)[2] + 2*m$clustervcv[1,2] )^.5,
         coef.d = m$coefficients[2], se.d = diag(m$clustervcv)[2]^.5)
  x <- matrix(x, nrow = 3, byrow = T)
  colnames(x) <- c("coef","se")
  x
})))

# ... Add names

plot.df$Specification <- rep(c("Baseline", "+ nature controls","+ nature & ethnic\n controls"), each = 1*3)
plot.df$pos <- rep(rep(c(3:1),each = 3), 1) + rep(c(.15,0,-.15), 1*3)
plot.df$Empire <- rep(c("British", "French", "French - British"), 1*3)

# Plot
stub <- "govscale_main"
png(paste0(fig.path,stub,".png"), width = 5, height = 3, unit = "in", res = 600)
g <- ggplot(plot.df, aes(y = pos, x = coef, col = Empire)) + geom_point(aes(col = Empire, pch = Empire)) + 
  geom_errorbarh(aes(xmin = coef - 1.96*se, 
                     xmax = coef + 1.96*se,
                     col= Empire), 
                 height=.1) +
  geom_vline(xintercept = 0, color = "darkgrey", lty = 2) +  
  scale_y_continuous(breaks=c(3:1), labels=unique(plot.df$Specification))  + 
  xlab("Marginal effect for precolonial centralization \n on the size of districts") + ylab(NULL) +
  scale_color_manual(breaks=c("British","French","French - British"),
                     values = c(`British` = "red", `French - British`  = "black", `French`  = "blue")) +
  theme(legend.position = "top") 
print(g)
dev.off()


#######################################
# ROBUSTNESS CHECKS
#######################################

#######################################
# Appendix Table A17
# Single table with:
# 1. Drop outliers
# 2. Weight by colony
# 3. Disease environment
# 4. Captial in 1885
# 6. M&P Murdock matching

# ... Stub
stub <- "govscale.robust1"

# ... Make empty list
this.m <- list()


# ... Drop outliers
this.m <- c(this.m, 
            list(felm(make_form(dv = dep.var, expl = c(main.expl, base.controls.int, nature.controls.int, ethnic.controls.int),
                                iv = "0", fe = fe.cross, se =  se.cluster), 
                      data = data.dist[data.dist[,"area"] > quantile(data.dist[,"area"],.025, na.rm = T) &
                                          data.dist[,"area"] < quantile(data.dist[,"area"], .975, na.rm = T),])))

# ... Weight by colony
form <- make_form(dv = dep.var, expl = c(main.expl, base.controls.int, nature.controls.int, ethnic.controls.int),
                  iv = "0", fe = fe.cross, se =  se.cluster)
this.data <- na.omit(data.dist[,all.vars(form)] )
this.data$weights <- gen_weights(this.data$colony)
this.m <- c(this.m, 
            list(felm(form, 
                      data = this.data, weights = this.data$weights)))


# ... Additional controls: desease environment
desease.vars <- c("malaria.tsuit.max","tsetse.max")
desease.vars <- paste0(desease.vars, "*colfrnc")
this.m <- c(this.m, 
            list(felm(make_form(dv = dep.var, expl = c(main.expl, desease.vars, base.controls.int, nature.controls.int, ethnic.controls.int),
                                iv = "0", fe = fe.cross, se =  se.cluster), 
                      data = data.dist)))

# ... Ruler in 1885
this.m <- c(this.m, 
            list(felm(make_form(dv = dep.var, expl = c("has.ruler.1885 + has.ruler.1885_colfrnc", 
                                                       base.controls.int, nature.controls.int, ethnic.controls.int),
                                iv = "0", fe = fe.cross, se =  se.cluster), 
                      data = data.dist)))

# ... M&P Matching
data.dist$v33.num.mp_colfrnc <- data.dist$v33.num.mp * data.dist$colfrnc
this.m <- c(this.m, 
            list(felm(make_form(dv = dep.var, expl = c("v33.num.mp + v33.num.mp_colfrnc", 
                                                       base.controls.int, nature.controls.int, ethnic.controls.int),
                                iv = "0", fe = fe.cross, se =  se.cluster), 
                      data = data.dist)))

# Output

# ... info
mean.dv <- latex.any.row("Mean DV", round_any(unlist(lapply(this.m, function(m){mean(m$response)})), 0.01) )
add.lines <- list(latex.any.row("Colony FE: ",rep("yes",length(this.m))),
                  latex.any.row("Colony weights: ",c("no","yes","no","no","no")),
                  latex.any.row("Desease controls: ",c("no","no","yes","no","no")),
                  latex.any.row("Baseline controls: ",rep("yes",length(this.m))),
                  latex.any.row("Nature controls: ",rep("yes",length(this.m))),
                  latex.any.row("Ethnic controls: ",rep("yes",length(this.m))),
                  mean.dv)

# ... stargazer
this.keep.controls <- c("v33.num", "v33.num_colfrnc",
                   "has.ruler.1885", "has.ruler.1885_colfrnc",
                   "v33.num.mp", "v33.num.mp_colfrnc")
this.keep.labels <- c("Precol. centralization", "Precol. centr. $\\times$ French",
                      "Capital 1885", "Capital 1885 $\\times$ French",
                      "Precol. centr. (MP)", "Precol. centr. (MP) $\\times$ French")

# Save
fileConn<-file(paste0(tab.path, stub, ".tex"))
writeLines(stargazer(this.m,
                     title="Precolonial centralization and the size of districts: Robustness checks",
                     dep.var.caption = "log(District Area)",dep.var.labels.include = FALSE,
                     column.labels = c("No outlier", "Col.-weight", "Disease","Cap. 1885","Alt. PCC"),
                     keep= this.keep.controls , covariate.labels = this.keep.labels,
                     notes.align = "l",label=paste0("tab.",stub),align =T,
                     add.lines = add.lines, 
                     digits = 2, intercept.top = T,intercept.bottom = F,
                     omit.stat = c("rsq","res.dev","ser"),column.sep.width = "-5pt",
                     notes = latex.notes.add(), notes.label = "", notes.append = F,
                     type  = output.type),
           fileConn)
close(fileConn)



######################################
# Colony-Jackknife
# Appendix Figure A11
######################################

stub <- "govscale.coljack"

# ... full specification 
add.vars <- list(c(base.controls.int),
                 c(base.controls.int, nature.controls.int),
                 c(base.controls.int, nature.controls.int, ethnic.controls.int))

# ... colonies / countries
colonies <- c("Baseline", na.omit(unique(data.dist$colony)))

# ... estimation
this.m <- lapply(add.vars, function(av){
  lapply(colonies, function(c){
    return(felm(make_form(dv = dep.var, expl = c(main.expl, av),
                          iv = "0", fe = fe.cross, se =  se.cluster), 
                data = data.dist[data.dist$colony != c,]))
  })
})

# ... get coefficients
plot.df <- data.frame(do.call(rbind,lapply(unlist(this.m, recursive = F), function(m){
  x <- c(coef.b = m$coefficients[1], se.b = diag(m$clustervcv)[1]^.5, 
         coef.f = m$coefficients[1] + m$coefficients[2], se.f = (diag(m$clustervcv)[1] + diag(m$clustervcv)[2] + 2*m$clustervcv[1,2] )^.5,
         coef.d = m$coefficients[2], se.d = diag(m$clustervcv)[2]^.5)
  x <- matrix(x, nrow = 3, byrow = T)
  colnames(x) <- c("coef","se")
  x
})))

# ... Add names
plot.df$name <- rep(rep(colonies,each = 3), length(add.vars))
plot.df$spec <- rep(c("(1) Baseline", "(2) + nature controls","(3) + nature + ethnic controls"), each = length(colonies)*3)
plot.df$Empire <- rep(c("British", "French", "Difference"), length(colonies)*length(add.vars))

# ... change names to current country names
plot.df$name[grep("CDI", plot.df$name)] <- "Côte d'Ivoire"
plot.df$name[grep("DAH", plot.df$name)] <- "Benin"
plot.df$name[grep("Gold Coast", plot.df$name)] <- "Ghana"
plot.df$name[grep("GUI", plot.df$name)] <- "Guinea"
plot.df$name[grep("HV", plot.df$name)] <- "Burkina Faso"
plot.df$name[grep("MAU", plot.df$name)] <- "Mauritania"
plot.df$name[grep("NIG", plot.df$name)] <- "Niger"
plot.df$name[grep("Northern Rhodesia", plot.df$name)] <- "Zimbabwe"
plot.df$name[grep("Nyasaland", plot.df$name)] <- "Malawi"
plot.df$name[grep("SEN", plot.df$name)] <- "Senegal"
plot.df$name[grep("SOU", plot.df$name)] <- "Mali"

# ... sort by name
plot.df <- plot.df[order(plot.df$name, plot.df$spec),]
plot.df$pos <- rep(c(length(colonies):1), each = length(add.vars)*3) + rep(c(.25,0,-.25), length(colonies)*length(add.vars))



# ... plot
png(paste0(fig.path,stub,".png"), width = 7, height = 5, unit = "in", res = 600)
g <- ggplot(plot.df, aes(y = pos, x = coef, color = Empire, group = Empire)) + geom_point() + 
  geom_errorbarh(aes(xmin = coef - 1.96*se, 
                     xmax = coef + 1.96*se), 
                 height=.1) +
  geom_vline(xintercept = 0, color = "darkgrey", lty = 2) +  
  scale_y_continuous(breaks=rev(c(length(colonies):1)), labels=rev(unique(plot.df$name))) + 
  facet_wrap(~ spec, nrow = 1) + xlab("Coefficient") + ylab(NULL) +
  scale_color_manual(breaks=c("British","French","Difference"),
                     values = c(`British` = "red", `French`  = "blue", `Difference`  = "black")) +
  theme(legend.position = "top")
print(g)
dev.off()

